from bs4 import BeautifulSoup

'''
Parse a locally saved web page.

Loads new_index.html, extracts each article's title, image, rating,
description and category metadata via CSS selectors, then prints every
article whose rating is greater than 3.
'''
# Hard-coded path to the saved page under test.
HTML_PATH = 'D:\\pythonEnvir\\SpiderP\\web\\new_index.html'

# Open in binary mode so BeautifulSoup/lxml detect the document's own
# encoding (meta charset / BOM) instead of relying on the platform
# default text codec, which can mis-decode non-ASCII pages on Windows.
with open(HTML_PATH, 'rb') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')

    # Selectors were copied via browser devtools
    # (Inspect -> Copy -> Copy selector).
    images = soup.select('body > div.main-content > ul > li > img')
    titles = soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
    rates = soup.select('body > div.main-content > ul > li > div.rate > span')
    descriptions = soup.select('div.main-content > ul > li > div.article-info > p.description')
    cate = soup.select('body > div.main-content > ul > li > div.article-info > p.meta-info')

    # One record per article. NOTE: zip() truncates to the shortest of
    # the five lists, so an article missing any field silently drops
    # every record after it.
    info = [
        {
            'title': title.getText(),
            'desc': desc.getText(),
            'rate': rate.getText(),
            # stripped_strings yields the text of every descendant tag
            # with surrounding whitespace removed.
            'cate': list(cat.stripped_strings),
            'image': image.get('src'),
        }
        for title, image, desc, rate, cat in zip(titles, images, descriptions, rates, cate)
    ]

    # Report only well-rated articles (rating strictly above 3).
    for item in info:
        if float(item['rate']) > 3:
            print(item['rate'], item['title'])
